use strict;
use warnings;

use XML::Simple qw(:strict);

# Convert an XML topics file into a tab-separated queries file.
#
# Usage: <this script> <topics.xml>
#
# The output is written to "<topics.xml>.queries". For every <topic> one line
#   topic-<number>-description<TAB><description text>
# is emitted, followed by one line per <subtopic> of that topic:
#   topic-<number>-subtopic-<subtopic number><TAB><subtopic text>
# All texts are trimmed and flattened onto a single line.
my $inputFilename = shift @ARGV;
if (!defined($inputFilename)) {
  die "Usage: $0 <topics.xml>\n";
}
my $outputFilename = "$inputFilename.queries";

# Parse first, so that a missing or malformed input file does not leave an
# empty (or truncated) output file behind.
# ForceArray => 1 turns every nested element into an array ref, and
# KeyAttr => {} switches off folding of element lists into hashes.
my $topics  = XMLin($inputFilename, ForceArray => 1, KeyAttr => {});

# Three-argument open with a lexical handle: the file name can no longer be
# misread as part of the mode. The parsed XML text consists of decoded
# characters, so an explicit UTF-8 layer is needed to write it consistently
# (and without "Wide character" warnings).
open(my $out_fh, '>:encoding(UTF-8)', $outputFilename) or
  die "File cannot be opened for writing - $outputFilename: $!";

# NOTE: the helpers below are called with a leading '&' on purpose. They are
# declared later in the file with a (\$) prototype, and '&' bypasses the
# prototype so that an explicit reference can be passed.
foreach my $topic (@{$topics->{topic}}) {
  my $topic_number = $topic->{number};

  my $topic_description = $topic->{description}->[0];
  # A <description> carrying attributes is returned as a hash with its text
  # stored under 'content'.
  if (ref($topic_description) eq 'HASH') {
    $topic_description = $topic_description->{content};
  }
  # An empty <description/> is returned as an empty hash and a missing one as
  # undef; fall back to an empty text instead of printing "HASH(0x...)".
  if (!defined($topic_description) || ref($topic_description)) {
    $topic_description = "";
  }
  &trimWhitespaceBothSides(\$topic_description);
  &reliablyRemoveEol(\$topic_description);
  &removeRepeatedWhitespace(\$topic_description);
  print {$out_fh} "topic-$topic_number-description\t$topic_description\n";

  foreach my $subtopic (@{$topic->{subtopic}}) {
    my $subtopic_number = $subtopic->{number};
    my $subtopic_content = $subtopic->{content};
    # occasionally, subtopic content might be empty,
    # hence need to check if it's defined first
    if (defined($subtopic_content) && length($subtopic_content) != 0) {
      &trimWhitespaceBothSides(\$subtopic_content);
      &reliablyRemoveEol(\$subtopic_content);
      &removeRepeatedWhitespace(\$subtopic_content);
    } else {
      $subtopic_content = "(empty subtopic)";
    }
    print {$out_fh} "topic-$topic_number-subtopic-$subtopic_number\t$subtopic_content\n";
  }
}

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the result of close must be checked.
close($out_fh) or
  die "File cannot be closed after writing - $outputFilename: $!";

# Strip whitespace from both ends of a string, in place.
#
# Takes a reference to the scalar holding the string; the referenced scalar
# itself is modified. Whitespace in the middle of the string is left alone.
sub trimWhitespaceBothSides(\$) {
  my $text_ref = shift;

  # drop everything whitespace-like at the front ...
  ${$text_ref} =~ s/^\s*//;
  # ... and at the back
  ${$text_ref} =~ s/\s*$//;
}

# Replace every end-of-line character in a string with a space, in place.
#
# Takes a reference to the scalar holding the string. Both carriage return
# (0x0D) and line feed (0x0A) are handled, each occurrence becoming exactly
# one space, so a CR+LF pair turns into two spaces.
sub reliablyRemoveEol(\$) {
  my $text_ref = shift;

  # carriage returns first, then line feeds; the characters are written as
  # hex escapes so the result does not depend on what "\n" means locally
  ${$text_ref} =~ s/\x0D/ /g;
  ${$text_ref} =~ s/\x0A/ /g;
}

# Collapse every run of whitespace in a string into one space, in place.
#
# Takes a reference to the scalar holding the string. A single whitespace
# character that is not a space (a tab, for instance) is also turned into a
# space. Leading and trailing runs are collapsed, not removed.
sub removeRepeatedWhitespace(\$) {
  my $text_ref = shift;

  ${$text_ref} =~ s/\s+/ /g;
}

